Import Libraries¶

In [3]:
# Data Cleaning & Transformation
import pandas as pd
import datatable as dt
pd.set_option('display.max.columns', 500)
pd.set_option('display.max.columns', 500)
import numpy as np

# Data Understanding
from ydata_profiling import ProfileReport

# Data Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Machine Learning
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hyperparameters Optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Ensemble Learning
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore")

Function Purpose: Due to the large number of records especially after one-hot encoding, a way to reduce the memory is to ensure that the data type nicely fits the min-max of the variables

In [4]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:        #-128 to 128
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:    #-32768 to 32768
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:    #-2147483648 to 2147483648
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:    #-9223372036854775808 to 9223372036854775808
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max  #-65500 to 65500
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max  #-3.40e+38 to -3.40e+38
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)                                   #-1.80e+308 to 1.80e+308
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

📝Data Overview¶

In [5]:
# train_df = dt.fread("data/Loan_status_2007-2020Q3.csv").to_pandas()
train_df = pd.read_csv("data/Loan_status_2007-2020Q3.csv")
train_df
Out[5]:
id loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies
0 1077501 5000.0 36 months 10.65% 162.87 B B2 NaN 10+ years RENT 24000.0 Verified Dec-2011 Fully Paid credit_card Computer 860xx 27.65 Jan-1985 3.0 0.0 13648.0 83.7% 9.0 f Individual NaN 0.0
1 1077430 2500.0 60 months 15.27% 59.83 C C4 Ryder < 1 year RENT 30000.0 Source Verified Dec-2011 Charged Off car bike 309xx 1.00 Apr-1999 3.0 0.0 1687.0 9.4% 4.0 f Individual NaN 0.0
2 1077175 2400.0 36 months 15.96% 84.33 C C5 NaN 10+ years RENT 12252.0 Not Verified Dec-2011 Fully Paid small_business real estate business 606xx 8.72 Nov-2001 2.0 0.0 2956.0 98.5% 10.0 f Individual NaN 0.0
3 1076863 10000.0 36 months 13.49% 339.31 C C1 AIR RESOURCES BOARD 10+ years RENT 49200.0 Source Verified Dec-2011 Fully Paid other personel 917xx 20.00 Feb-1996 10.0 0.0 5598.0 21% 37.0 f Individual NaN 0.0
4 1075358 3000.0 60 months 12.69% 67.79 B B5 University Medical Group 1 year RENT 80000.0 Source Verified Dec-2011 Fully Paid other Personal 972xx 17.94 Jan-1996 15.0 0.0 27783.0 53.9% 38.0 f Individual NaN 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925488 102556443 24000.0 60 months 23.99% 690.30 E E2 Senior Escrow Officer < 1 year RENT 107000.0 Source Verified Apr-2017 Charged Off other Other 801xx 11.65 Apr-1995 13.0 2.0 9688.0 24.9% 24.0 f Individual 0.0 1.0
2925489 102653304 10000.0 36 months 7.99% 313.32 A A5 Rn 10+ years MORTGAGE 65000.0 Source Verified Apr-2017 Fully Paid debt_consolidation Debt consolidation 187xx 19.55 Sep-1993 15.0 0.0 9751.0 15.7% 27.0 w Individual 0.0 0.0
2925490 102628603 10050.0 36 months 16.99% 358.26 D D1 Sales Associate 8 years RENT 37000.0 Not Verified Apr-2017 Charged Off debt_consolidation Debt consolidation 220xx 20.56 May-1993 15.0 0.0 14300.0 47% 21.0 w Individual 0.0 0.0
2925491 102196576 6000.0 36 months 11.44% 197.69 B B4 Contact input 5 years RENT 41000.0 Source Verified Apr-2017 Fully Paid credit_card Credit card refinancing 105xx 19.99 May-1990 9.0 0.0 1356.0 10.1% 18.0 w Individual 0.0 0.0
2925492 99799684 30000.0 60 months 25.49% 889.18 E E4 Assistant Manager 4 years MORTGAGE 105700.0 Verified Apr-2017 Charged Off debt_consolidation Debt consolidation 797xx 27.26 Nov-1997 12.0 0.0 15252.0 72.6% 23.0 w Individual 3.0 0.0

2925493 rows × 28 columns

Use Pandas Profile Report for EDA¶

Note: Not ideal for datasets with too many columns and rows

In [41]:
# profile = ProfileReport(train_df)
# profile.to_notebook_iframe()

Data Type Assessment¶

In [42]:
train_df.info(verbose=True, show_counts=True)
# verbose: Whether to print the full summary
# show_counts: Whether to show the non-null counts. By default, this is shown only if the DataFrame is small
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2925493 entries, 0 to 2925492
Data columns (total 28 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    2925493 non-null  object 
 1   loan_amnt             2925492 non-null  float64
 2   term                  2925492 non-null  object 
 3   int_rate              2925492 non-null  object 
 4   installment           2925492 non-null  float64
 5   grade                 2925492 non-null  object 
 6   sub_grade             2925492 non-null  object 
 7   emp_title             2661406 non-null  object 
 8   emp_length            2720272 non-null  object 
 9   home_ownership        2925492 non-null  object 
 10  annual_inc            2925488 non-null  float64
 11  verification_status   2925492 non-null  object 
 12  issue_d               2925492 non-null  object 
 13  loan_status           2925492 non-null  object 
 14  purpose               2925492 non-null  object 
 15  title                 2902167 non-null  object 
 16  zip_code              2925491 non-null  object 
 17  dti                   2922384 non-null  float64
 18  earliest_cr_line      2925463 non-null  object 
 19  open_acc              2925463 non-null  float64
 20  pub_rec               2925463 non-null  float64
 21  revol_bal             2925492 non-null  float64
 22  revol_util            2922832 non-null  object 
 23  total_acc             2925463 non-null  float64
 24  initial_list_status   2925492 non-null  object 
 25  application_type      2925492 non-null  object 
 26  mort_acc              2875462 non-null  float64
 27  pub_rec_bankruptcies  2924127 non-null  float64
dtypes: float64(10), object(18)
memory usage: 625.0+ MB

Data Type </br>int_rate, issue_d, earliest_cr_line & revol_util have incorrect data types

</br> Missing Values </br>Almost all columns except id have missing values. </br> It is not easy to find out the number or percentage of missing values in each column, hence using for loop for better illustration

In [43]:
full = []
for col in train_df.columns.tolist():
    if train_df[col].isnull().sum() != 0:
        total = train_df[col].isnull().sum()
        percent = round(train_df[col].isnull().sum()/len(train_df)*100, 2)
        ls = [col, total, percent]
        full.append(ls)
    else:
        pass

missing = pd.DataFrame(full, columns = ['Variable', 'No. of Missing Values', 'Percent'])
missing
Out[43]:
Variable No. of Missing Values Percent
0 loan_amnt 1 0.00
1 term 1 0.00
2 int_rate 1 0.00
3 installment 1 0.00
4 grade 1 0.00
5 sub_grade 1 0.00
6 emp_title 264087 9.03
7 emp_length 205221 7.01
8 home_ownership 1 0.00
9 annual_inc 5 0.00
10 verification_status 1 0.00
11 issue_d 1 0.00
12 loan_status 1 0.00
13 purpose 1 0.00
14 title 23326 0.80
15 zip_code 2 0.00
16 dti 3109 0.11
17 earliest_cr_line 30 0.00
18 open_acc 30 0.00
19 pub_rec 30 0.00
20 revol_bal 1 0.00
21 revol_util 2661 0.09
22 total_acc 30 0.00
23 initial_list_status 1 0.00
24 application_type 1 0.00
25 mort_acc 50031 1.71
26 pub_rec_bankruptcies 1366 0.05
  • All the columns have less than 10% missing values
  • Notably, emp_title & emp_length have the highest number of missing values. emp_title may be difficult to impute due to its high cardinality by nature
In [44]:
# Categorical Variables
train_df.describe(include = 'object').T
Out[44]:
count unique top freq
id 2925493 2925493 1077501 1
term 2925492 2 36 months 2060077
int_rate 2925492 704 8.19% 68199
grade 2925492 7 B 857573
sub_grade 2925492 35 B4 185188
emp_title 2661406 590413 Teacher 50103
emp_length 2720272 11 10+ years 946268
home_ownership 2925492 6 MORTGAGE 1437859
verification_status 2925492 3 Source Verified 1143247
issue_d 2925492 160 Mar-2016 61992
loan_status 2925492 10 Fully Paid 1497783
purpose 2925492 14 debt_consolidation 1638058
title 2902167 63155 Debt consolidation 1513474
zip_code 2925491 962 112xx 30473
earliest_cr_line 2925463 777 Sep-2005 20089
revol_util 2922832 1443 0% 18856
initial_list_status 2925492 2 w 2139434
application_type 2925492 2 Individual 2714979
In [45]:
# Numeric Variables
train_df.describe().T
Out[45]:
count mean std min 25% 50% 75% max
loan_amnt 2925492.0 15358.775105 9478.383451 500.00 8000.00 13000.0 20000.00 4.000000e+04
installment 2925492.0 452.391358 272.947849 4.93 253.50 382.3 603.92 1.719830e+03
annual_inc 2925488.0 79937.274015 111747.608711 0.00 47000.00 66000.0 95000.00 1.100000e+08
dti 2922384.0 19.296480 15.719773 -1.00 12.08 18.1 24.88 9.990000e+02
open_acc 2925463.0 11.676163 5.733312 0.00 8.00 11.0 15.00 1.040000e+02
pub_rec 2925463.0 0.176285 0.524376 0.00 0.00 0.0 0.00 8.600000e+01
revol_bal 2925492.0 16964.840934 22996.213652 0.00 5993.00 11493.0 20644.00 2.904836e+06
total_acc 2925463.0 24.018350 12.076252 1.00 15.00 22.0 31.00 1.760000e+02
mort_acc 2875462.0 1.512336 1.869776 0.00 0.00 1.0 2.00 9.400000e+01
pub_rec_bankruptcies 2924127.0 0.122679 0.353141 0.00 0.00 0.0 0.00 1.200000e+01

Duplicated Records?¶

In [46]:
if len(train_df[train_df.duplicated('id', keep = False) == True]) == 0:
    print('No Duplicates')
else:
    raise Exception('Duplicated ID records')
No Duplicates

Data Cleaning¶

  • Reformat data types ✅
  • Address missing values ✅
  • Remove Duplicates ❌
  • Remove irrevalant features ✅
  • Filter unwanted outliers ✅ ## Changing Data Types
In [47]:
train_df[train_df['int_rate'].isnull() == True]
Out[47]:
id loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies
39786 Loans that do not meet the credit policy NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [48]:
# Convert int_rate from str to float
train_df2 = train_df.copy()
train_df2['int_rate'] = train_df2['int_rate'].str[:-1]
train_df2['int_rate'] = train_df2['int_rate'].astype(float)
train_df2
Out[48]:
id loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies
0 1077501 5000.0 36 months 10.65 162.87 B B2 NaN 10+ years RENT 24000.0 Verified Dec-2011 Fully Paid credit_card Computer 860xx 27.65 Jan-1985 3.0 0.0 13648.0 83.7% 9.0 f Individual NaN 0.0
1 1077430 2500.0 60 months 15.27 59.83 C C4 Ryder < 1 year RENT 30000.0 Source Verified Dec-2011 Charged Off car bike 309xx 1.00 Apr-1999 3.0 0.0 1687.0 9.4% 4.0 f Individual NaN 0.0
2 1077175 2400.0 36 months 15.96 84.33 C C5 NaN 10+ years RENT 12252.0 Not Verified Dec-2011 Fully Paid small_business real estate business 606xx 8.72 Nov-2001 2.0 0.0 2956.0 98.5% 10.0 f Individual NaN 0.0
3 1076863 10000.0 36 months 13.49 339.31 C C1 AIR RESOURCES BOARD 10+ years RENT 49200.0 Source Verified Dec-2011 Fully Paid other personel 917xx 20.00 Feb-1996 10.0 0.0 5598.0 21% 37.0 f Individual NaN 0.0
4 1075358 3000.0 60 months 12.69 67.79 B B5 University Medical Group 1 year RENT 80000.0 Source Verified Dec-2011 Fully Paid other Personal 972xx 17.94 Jan-1996 15.0 0.0 27783.0 53.9% 38.0 f Individual NaN 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925488 102556443 24000.0 60 months 23.99 690.30 E E2 Senior Escrow Officer < 1 year RENT 107000.0 Source Verified Apr-2017 Charged Off other Other 801xx 11.65 Apr-1995 13.0 2.0 9688.0 24.9% 24.0 f Individual 0.0 1.0
2925489 102653304 10000.0 36 months 7.99 313.32 A A5 Rn 10+ years MORTGAGE 65000.0 Source Verified Apr-2017 Fully Paid debt_consolidation Debt consolidation 187xx 19.55 Sep-1993 15.0 0.0 9751.0 15.7% 27.0 w Individual 0.0 0.0
2925490 102628603 10050.0 36 months 16.99 358.26 D D1 Sales Associate 8 years RENT 37000.0 Not Verified Apr-2017 Charged Off debt_consolidation Debt consolidation 220xx 20.56 May-1993 15.0 0.0 14300.0 47% 21.0 w Individual 0.0 0.0
2925491 102196576 6000.0 36 months 11.44 197.69 B B4 Contact input 5 years RENT 41000.0 Source Verified Apr-2017 Fully Paid credit_card Credit card refinancing 105xx 19.99 May-1990 9.0 0.0 1356.0 10.1% 18.0 w Individual 0.0 0.0
2925492 99799684 30000.0 60 months 25.49 889.18 E E4 Assistant Manager 4 years MORTGAGE 105700.0 Verified Apr-2017 Charged Off debt_consolidation Debt consolidation 797xx 27.26 Nov-1997 12.0 0.0 15252.0 72.6% 23.0 w Individual 3.0 0.0

2925493 rows × 28 columns

In [49]:
# Convert revol_util from str to float
train_df2['revol_util'] = train_df['revol_util'].str[:-1]
train_df2['revol_util'] = train_df2['revol_util'].astype(float)
train_df2['revol_util'].dtype
Out[49]:
dtype('float64')
In [50]:
# Convert earliest_cr_line from str to datetime
train_df2['earliest_cr_line'] = pd.to_datetime(train_df['earliest_cr_line'], format = '%b-%Y')
train_df2['earliest_cr_line']
Out[50]:
0         1985-01-01
1         1999-04-01
2         2001-11-01
3         1996-02-01
4         1996-01-01
             ...    
2925488   1995-04-01
2925489   1993-09-01
2925490   1993-05-01
2925491   1990-05-01
2925492   1997-11-01
Name: earliest_cr_line, Length: 2925493, dtype: datetime64[ns]
In [51]:
# Convert issue_d from str to datetime
train_df2['issue_d'] = pd.to_datetime(train_df['issue_d'], format = '%b-%Y').dt.date
train_df2['issue_d']
Out[51]:
0          2011-12-01
1          2011-12-01
2          2011-12-01
3          2011-12-01
4          2011-12-01
              ...    
2925488    2017-04-01
2925489    2017-04-01
2925490    2017-04-01
2925491    2017-04-01
2925492    2017-04-01
Name: issue_d, Length: 2925493, dtype: object

Handling Zeros (For columns not supposed to have 0)¶

In [52]:
numeric = train_df2.select_dtypes(exclude = 'object')
numeric
Out[52]:
loan_amnt int_rate installment annual_inc dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc mort_acc pub_rec_bankruptcies
0 5000.0 10.65 162.87 24000.0 27.65 1985-01-01 3.0 0.0 13648.0 83.7 9.0 NaN 0.0
1 2500.0 15.27 59.83 30000.0 1.00 1999-04-01 3.0 0.0 1687.0 9.4 4.0 NaN 0.0
2 2400.0 15.96 84.33 12252.0 8.72 2001-11-01 2.0 0.0 2956.0 98.5 10.0 NaN 0.0
3 10000.0 13.49 339.31 49200.0 20.00 1996-02-01 10.0 0.0 5598.0 21.0 37.0 NaN 0.0
4 3000.0 12.69 67.79 80000.0 17.94 1996-01-01 15.0 0.0 27783.0 53.9 38.0 NaN 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925488 24000.0 23.99 690.30 107000.0 11.65 1995-04-01 13.0 2.0 9688.0 24.9 24.0 0.0 1.0
2925489 10000.0 7.99 313.32 65000.0 19.55 1993-09-01 15.0 0.0 9751.0 15.7 27.0 0.0 0.0
2925490 10050.0 16.99 358.26 37000.0 20.56 1993-05-01 15.0 0.0 14300.0 47.0 21.0 0.0 0.0
2925491 6000.0 11.44 197.69 41000.0 19.99 1990-05-01 9.0 0.0 1356.0 10.1 18.0 0.0 0.0
2925492 30000.0 25.49 889.18 105700.0 27.26 1997-11-01 12.0 0.0 15252.0 72.6 23.0 3.0 0.0

2925493 rows × 13 columns

In [53]:
full = []
for col in numeric.columns.tolist():
    if (numeric[col]==0).sum() != 0:
        total = (numeric[col]==0).sum()
        percent = round((numeric[col]==0).sum()/len(numeric)*100, 2)
        ls = [col, total, percent]
        full.append(ls)
    else:
        pass

numeric_df = pd.DataFrame(full, columns = ['Variable', 'No. of 0s', 'Percent'])
numeric_df
Out[53]:
Variable No. of 0s Percent
0 annual_inc 3054 0.10
1 dti 2353 0.08
2 open_acc 92 0.00
3 pub_rec 2499290 85.43
4 revol_bal 18099 0.62
5 revol_util 18856 0.64
6 mort_acc 1233212 42.15
7 pub_rec_bankruptcies 2583966 88.33
  • annual_inc: Possible to be 0 as there could be borrower who has no income but just want to borrow money. However do ensure that their dti is either NaN or 0
  • dti: Can be 0 as there may be borrower with no debt
  • The rest are account-based variables, hence reasonable to be 0
In [54]:
# Check to ensure all records with annual income as 0 have their dti as 0 or NaN
if len(numeric[(numeric['annual_inc'] == 0) & (~numeric['dti'].isin([np.nan, 0]))]) == 0:
    pass
else:
    raise Exception('Pls rectify')

Handling Missing Values¶

Columns with more than 70% missing values are said to not provide any useful information for the model to learn
In [55]:
# Remove any columns with more than 70% missing values
train_df2a = train_df2[train_df2.columns[train_df2.isnull().mean() < 0.7]]
print('Num of columns removed:', len(train_df2) - len(train_df2a))
Num of columns removed: 0
In [56]:
# Remove any rows with more than 70% missing values
train_df2b = train_df2[train_df2.isnull().mean(axis = 1) < 0.7]
print('Num of rows removed:', len(train_df2a) - len(train_df2b))
Num of rows removed: 1
In [57]:
full = []
for col in train_df2b.columns.tolist():
    if train_df2b[col].isnull().sum() != 0:
        total = train_df2b[col].isnull().sum()
        percent = round(train_df2b[col].isnull().sum()/len(train_df2b)*100, 2)
        ls = [col, total, percent]
        full.append(ls)
    else:
        pass

missing = pd.DataFrame(full, columns = ['Variable', 'No. of Missing Values', 'Percent'])
missing
Out[57]:
Variable No. of Missing Values Percent
0 emp_title 264086 9.03
1 emp_length 205220 7.01
2 annual_inc 4 0.00
3 title 23325 0.80
4 zip_code 1 0.00
5 dti 3108 0.11
6 earliest_cr_line 29 0.00
7 open_acc 29 0.00
8 pub_rec 29 0.00
9 revol_util 2660 0.09
10 total_acc 29 0.00
11 mort_acc 50030 1.71
12 pub_rec_bankruptcies 1365 0.05

Handling Missing Values for Numeric Variables¶

In [58]:
missing[missing['Variable'].isin(train_df2b.select_dtypes(exclude = 'object'))]
Out[58]:
Variable No. of Missing Values Percent
2 annual_inc 4 0.00
5 dti 3108 0.11
6 earliest_cr_line 29 0.00
7 open_acc 29 0.00
8 pub_rec 29 0.00
9 revol_util 2660 0.09
10 total_acc 29 0.00
11 mort_acc 50030 1.71
12 pub_rec_bankruptcies 1365 0.05

Imputation of columns with missing values </u>
revol_util, mort_acc, annual_inc, pub_rec_bankruptcies are numeric variables, but are right-skewed. Hence to impute using median
dti can be imputed using mean() as it is not skewed

In [59]:
train_df3 = train_df2b.copy()
train_df3['annual_inc'] = train_df3['annual_inc'].fillna(train_df3['annual_inc'].median())
train_df3['revol_util'] = train_df3['revol_util'].fillna(train_df3['revol_util'].median())
train_df3['mort_acc'] = train_df3['mort_acc'].fillna(train_df3['mort_acc'].median())
train_df3['pub_rec_bankruptcies'] = train_df3['pub_rec_bankruptcies'].fillna(train_df3['pub_rec_bankruptcies'].median())

train_df3['dti'] = train_df3['dti'].fillna(train_df3['dti'].mean())

Notice that 13 records have missing account-related variables. Hence decided to remove these 13 records

In [60]:
train_df3[['open_acc', 'earliest_cr_line', 'open_acc', 'pub_rec', 'total_acc']][train_df3['open_acc'].isna()]
Out[60]:
open_acc earliest_cr_line open_acc pub_rec total_acc
42450 NaN NaT NaN NaN NaN
42451 NaN NaT NaN NaN NaN
42460 NaN NaT NaN NaN NaN
42473 NaN NaT NaN NaN NaN
42481 NaN NaT NaN NaN NaN
42484 NaN NaT NaN NaN NaN
42495 NaN NaT NaN NaN NaN
42510 NaN NaT NaN NaN NaN
42515 NaN NaT NaN NaN NaN
42516 NaN NaT NaN NaN NaN
42517 NaN NaT NaN NaN NaN
42518 NaN NaT NaN NaN NaN
42519 NaN NaT NaN NaN NaN
42520 NaN NaT NaN NaN NaN
42521 NaN NaT NaN NaN NaN
42522 NaN NaT NaN NaN NaN
42523 NaN NaT NaN NaN NaN
42524 NaN NaT NaN NaN NaN
42525 NaN NaT NaN NaN NaN
42526 NaN NaT NaN NaN NaN
42527 NaN NaT NaN NaN NaN
42528 NaN NaT NaN NaN NaN
42529 NaN NaT NaN NaN NaN
42530 NaN NaT NaN NaN NaN
42531 NaN NaT NaN NaN NaN
42532 NaN NaT NaN NaN NaN
42533 NaN NaT NaN NaN NaN
42534 NaN NaT NaN NaN NaN
42535 NaN NaT NaN NaN NaN
In [61]:
train_df3 = train_df3[~train_df3['open_acc'].isna()]
train_df3.select_dtypes(exclude = 'object').isna().sum()
Out[61]:
loan_amnt               0
int_rate                0
installment             0
annual_inc              0
dti                     0
earliest_cr_line        0
open_acc                0
pub_rec                 0
revol_bal               0
revol_util              0
total_acc               0
mort_acc                0
pub_rec_bankruptcies    0
dtype: int64

Handling Missing Values for Categorical Variables¶

In [62]:
missing[missing['Variable'].isin(train_df3.select_dtypes(include = 'object'))]
Out[62]:
Variable No. of Missing Values Percent
0 emp_title 264086 9.03
1 emp_length 205220 7.01
3 title 23325 0.80
4 zip_code 1 0.00
In [63]:
train_df3['emp_length'] = train_df3['emp_length'].fillna(train_df3['emp_length'].mode())

Removing Unnecessary Features¶

In [64]:
train_df4 = train_df3.drop(['issue_d', 'id'], axis = 1)
train_df4.sample(3)
Out[64]:
loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies
2639237 26000.0 36 months 19.99 966.13 D D4 Business development director 5 years MORTGAGE 92000.0 Verified Fully Paid debt_consolidation Debt consolidation 711xx 15.29 2003-11-01 6.0 0.0 24199.0 83.4 15.0 f Individual 2.0 0.0
1080527 10000.0 36 months 16.14 352.27 C C4 Valet Supervisor 2 years RENT 38000.0 Not Verified Current debt_consolidation Debt consolidation 980xx 10.20 2010-02-01 4.0 0.0 11933.0 75.1 7.0 f Individual 0.0 0.0
1203866 18000.0 36 months 22.50 692.10 D D3 Occupational Therapist 10+ years OWN 60000.0 Verified Current credit_card Credit card refinancing 620xx 37.84 2001-05-01 8.0 0.0 28691.0 63.6 14.0 w Individual 1.0 0.0

issue_d may not be useful for prediction as this is done after the issuance of loan </br> id are removed as they are not predictor variable but identifiers

Distribution of Outcome Variable¶

  • Is it necessary to retain the cardinality of the outcome variable or can it be simplied to a binary classification? Also good to relook at the business problem again to answer this question
  • How is the distribution of the outcomes? Are they well represented?
In [65]:
loan_status_dict = train_df4['loan_status'].value_counts().to_dict()
plt.barh(list(loan_status_dict.keys()), list(loan_status_dict.values()))
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

for i, v in enumerate(train_df4['loan_status'].value_counts()):
    plt.text(v, i, str(v), fontsize = 12, ha='left')
    
plt.show()

Relabelling the Loan Status

From the above visualisation, we observed that 10 type of loan status exist in this data set. We are only interested in 2 status i.e. Defaulted and Not Defaulted. Hence, we will need to add a new variable which will be binary (0s and 1s).

  • 0 means Not Defaulted
  • 1 means Defaulted

All those loans, whose status is “Fully Paid”, “Current” will be categorized as Not Defaulted and anything else will be categorized as Defaulted. To achieve this we will introduce new variable defaulted.

In [66]:
# First we define the function
def change_loan_status(loan_status):
    if loan_status in ['Fully Paid', 'Current']:
        return 0
    else:
        return 1

# Next we apply the function
train_df4['loan_status'] = train_df4['loan_status'].apply(change_loan_status)
train_df4.head()
Out[66]:
loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies
0 5000.0 36 months 10.65 162.87 B B2 NaN 10+ years RENT 24000.0 Verified 0 credit_card Computer 860xx 27.65 1985-01-01 3.0 0.0 13648.0 83.7 9.0 f Individual 1.0 0.0
1 2500.0 60 months 15.27 59.83 C C4 Ryder < 1 year RENT 30000.0 Source Verified 1 car bike 309xx 1.00 1999-04-01 3.0 0.0 1687.0 9.4 4.0 f Individual 1.0 0.0
2 2400.0 36 months 15.96 84.33 C C5 NaN 10+ years RENT 12252.0 Not Verified 0 small_business real estate business 606xx 8.72 2001-11-01 2.0 0.0 2956.0 98.5 10.0 f Individual 1.0 0.0
3 10000.0 36 months 13.49 339.31 C C1 AIR RESOURCES BOARD 10+ years RENT 49200.0 Source Verified 0 other personel 917xx 20.00 1996-02-01 10.0 0.0 5598.0 21.0 37.0 f Individual 1.0 0.0
4 3000.0 60 months 12.69 67.79 B B5 University Medical Group 1 year RENT 80000.0 Source Verified 0 other Personal 972xx 17.94 1996-01-01 15.0 0.0 27783.0 53.9 38.0 f Individual 1.0 0.0

Once again, we apply the value_counts() to see the number of instances of each unique status in the loan_status data column. Now we can see that the loan status only has 2 category.

In [67]:
loanstatus = train_df4['loan_status'].value_counts()
sns.barplot(loanstatus.index, loanstatus.values)
Out[67]:
<AxesSubplot:>
Based on the bar chart above, it can be seen that the dataset is imbalanced with many more records which does not default as compared to its counterpart.

🔍Exploratory Data Analysis & Features Assessment¶

Determine the distribution of loan status for categorical features¶

In [68]:
train_df4.select_dtypes('object').columns
Out[68]:
Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'purpose', 'title', 'zip_code',
       'initial_list_status', 'application_type'],
      dtype='object')
In [69]:
for i in train_df4.term.unique():
    print(i)
    print(f"{train_df4[train_df4.term == i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
 36 months
0    0.884
1    0.116
Name: loan_status, dtype: float64
==========================================
 60 months
0    0.818
1    0.182
Name: loan_status, dtype: float64
==========================================
In [70]:
for i in train_df4.emp_length.unique():
    print(i)
    print(f"{train_df4[train_df4.emp_length == i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
10+ years
0    0.873
1    0.127
Name: loan_status, dtype: float64
==========================================
< 1 year
0    0.869
1    0.131
Name: loan_status, dtype: float64
==========================================
1 year
0    0.86
1    0.14
Name: loan_status, dtype: float64
==========================================
3 years
0    0.863
1    0.137
Name: loan_status, dtype: float64
==========================================
8 years
0    0.856
1    0.144
Name: loan_status, dtype: float64
==========================================
9 years
0    0.857
1    0.143
Name: loan_status, dtype: float64
==========================================
4 years
0    0.864
1    0.136
Name: loan_status, dtype: float64
==========================================
5 years
0    0.866
1    0.134
Name: loan_status, dtype: float64
==========================================
6 years
0    0.866
1    0.134
Name: loan_status, dtype: float64
==========================================
2 years
0    0.865
1    0.135
Name: loan_status, dtype: float64
==========================================
7 years
0    0.862
1    0.138
Name: loan_status, dtype: float64
==========================================
nan
Series([], Name: loan_status, dtype: float64)
==========================================

Distribution of loan status is similar across varying years of employment, hence to be removed

In [71]:
for i in train_df4.sort_values('sub_grade')['sub_grade'].unique():
    print(i)
    print(f"{train_df4[train_df4.sub_grade == i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
A1
0    0.976
1    0.024
Name: loan_status, dtype: float64
==========================================
A2
0    0.967
1    0.033
Name: loan_status, dtype: float64
==========================================
A3
0    0.96
1    0.04
Name: loan_status, dtype: float64
==========================================
A4
0    0.955
1    0.045
Name: loan_status, dtype: float64
==========================================
A5
0    0.941
1    0.059
Name: loan_status, dtype: float64
==========================================
B1
0    0.926
1    0.074
Name: loan_status, dtype: float64
==========================================
B2
0    0.918
1    0.082
Name: loan_status, dtype: float64
==========================================
B3
0    0.902
1    0.098
Name: loan_status, dtype: float64
==========================================
B4
0    0.897
1    0.103
Name: loan_status, dtype: float64
==========================================
B5
0    0.883
1    0.117
Name: loan_status, dtype: float64
==========================================
C1
0    0.864
1    0.136
Name: loan_status, dtype: float64
==========================================
C2
0    0.85
1    0.15
Name: loan_status, dtype: float64
==========================================
C3
0    0.841
1    0.159
Name: loan_status, dtype: float64
==========================================
C4
0    0.82
1    0.18
Name: loan_status, dtype: float64
==========================================
C5
0    0.818
1    0.182
Name: loan_status, dtype: float64
==========================================
D1
0    0.798
1    0.202
Name: loan_status, dtype: float64
==========================================
D2
0    0.809
1    0.191
Name: loan_status, dtype: float64
==========================================
D3
0    0.783
1    0.217
Name: loan_status, dtype: float64
==========================================
D4
0    0.754
1    0.246
Name: loan_status, dtype: float64
==========================================
D5
0    0.741
1    0.259
Name: loan_status, dtype: float64
==========================================
E1
0    0.695
1    0.305
Name: loan_status, dtype: float64
==========================================
E2
0    0.677
1    0.323
Name: loan_status, dtype: float64
==========================================
E3
0    0.666
1    0.334
Name: loan_status, dtype: float64
==========================================
E4
0    0.653
1    0.347
Name: loan_status, dtype: float64
==========================================
E5
0    0.649
1    0.351
Name: loan_status, dtype: float64
==========================================
F1
0    0.623
1    0.377
Name: loan_status, dtype: float64
==========================================
F2
0    0.592
1    0.408
Name: loan_status, dtype: float64
==========================================
F3
0    0.592
1    0.408
Name: loan_status, dtype: float64
==========================================
F4
0    0.563
1    0.437
Name: loan_status, dtype: float64
==========================================
F5
0    0.553
1    0.447
Name: loan_status, dtype: float64
==========================================
G1
0    0.572
1    0.428
Name: loan_status, dtype: float64
==========================================
G2
0    0.553
1    0.447
Name: loan_status, dtype: float64
==========================================
G3
0    0.535
1    0.465
Name: loan_status, dtype: float64
==========================================
G4
0    0.523
1    0.477
Name: loan_status, dtype: float64
==========================================
G5
0    0.503
1    0.497
Name: loan_status, dtype: float64
==========================================

Intuitively, borrower with better loan grade tends not to default as seen above where a larger proportion of records that does not defaults are in the higher loan grade


Interestingly, loan grade of similar character have seemingly same loan distribution. Hence to drop sub_grade and keep grade

In [72]:
for i in train_df4.home_ownership.unique():
    print(i)
    print(f"{train_df4[train_df4.home_ownership == i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
RENT
0    0.843
1    0.157
Name: loan_status, dtype: float64
==========================================
OWN
0    0.863
1    0.137
Name: loan_status, dtype: float64
==========================================
MORTGAGE
0    0.882
1    0.118
Name: loan_status, dtype: float64
==========================================
OTHER
0    0.643
1    0.357
Name: loan_status, dtype: float64
==========================================
NONE
0    0.843
1    0.157
Name: loan_status, dtype: float64
==========================================
ANY
0    0.91
1    0.09
Name: loan_status, dtype: float64
==========================================
In [73]:
for i in train_df4.verification_status.unique():
    print(i)
    print(f"{train_df4[train_df4.verification_status == i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
Verified
0    0.814
1    0.186
Name: loan_status, dtype: float64
==========================================
Source Verified
0    0.86
1    0.14
Name: loan_status, dtype: float64
==========================================
Not Verified
0    0.903
1    0.097
Name: loan_status, dtype: float64
==========================================
In [74]:
for i in train_df4.purpose.unique():
    print(i)
    print(f"{train_df4[train_df4.purpose== i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
credit_card
0    0.889
1    0.111
Name: loan_status, dtype: float64
==========================================
car
0    0.895
1    0.105
Name: loan_status, dtype: float64
==========================================
small_business
0    0.786
1    0.214
Name: loan_status, dtype: float64
==========================================
other
0    0.859
1    0.141
Name: loan_status, dtype: float64
==========================================
wedding
0    0.857
1    0.143
Name: loan_status, dtype: float64
==========================================
debt_consolidation
0    0.854
1    0.146
Name: loan_status, dtype: float64
==========================================
home_improvement
0    0.878
1    0.122
Name: loan_status, dtype: float64
==========================================
major_purchase
0    0.864
1    0.136
Name: loan_status, dtype: float64
==========================================
medical
0    0.857
1    0.143
Name: loan_status, dtype: float64
==========================================
moving
0    0.835
1    0.165
Name: loan_status, dtype: float64
==========================================
vacation
0    0.873
1    0.127
Name: loan_status, dtype: float64
==========================================
house
0    0.857
1    0.143
Name: loan_status, dtype: float64
==========================================
renewable_energy
0    0.833
1    0.167
Name: loan_status, dtype: float64
==========================================
educational
0    0.639
1    0.361
Name: loan_status, dtype: float64
==========================================

Distinctively, those loan for education is more likely to default based on the distribution

In [75]:
for i in train_df4.initial_list_status.unique():
    print(i)
    print(f"{train_df4[train_df4.initial_list_status== i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
f
0    0.83
1    0.17
Name: loan_status, dtype: float64
==========================================
w
0    0.877
1    0.123
Name: loan_status, dtype: float64
==========================================

Distribution of loan status is similar across varying different initial listing status of loan, hence to be removed

In [76]:
for i in train_df4.application_type.unique():
    print(i)
    print(f"{train_df4[train_df4.application_type== i].loan_status.value_counts(normalize=True).round(3)}")
    print('==========================================')
Individual
0    0.862
1    0.138
Name: loan_status, dtype: float64
==========================================
Joint App
0    0.897
1    0.103
Name: loan_status, dtype: float64
==========================================

Collinearity between numeric features¶

In [77]:
train_df4['loan_status'] = train_df4['loan_status'].apply(str)

corr = train_df4.corr().round(3)
fig = px.imshow(corr, color_continuous_scale = 'plasma', text_auto = True, aspect = 'auto')

fig.show()

From the heat map, we can deduce strong positive correlation (>=0.5) between different variables:

  • loan_amt ↔ installment
  • total_acc ↔ open_acc
  • pub_rec ↔ pub_rec_bankruptcies

There is an absence of strong negative correlation between variables

Collinearity between catgorical features¶

https://medium.com/@ritesh.110587/correlation-between-categorical-variables-63f6bd9bf2f7

https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

In [78]:
from itertools import product
import scipy.stats as ss
# Create 2 list of cat variables
col_cat1 = train_df4.select_dtypes(include = 'object').columns
col_cat2 = train_df4.select_dtypes(include = 'object').columns.sort_values()

cat_combi = list(product(col_cat1, col_cat2, repeat = 1))
cat_combi_1 = set(cat_combi) - set([('title', 'emp_title'), ('emp_title', 'title')])
cat_combi_1
Out[78]:
{('application_type', 'application_type'),
 ('application_type', 'emp_length'),
 ('application_type', 'emp_title'),
 ('application_type', 'grade'),
 ('application_type', 'home_ownership'),
 ('application_type', 'initial_list_status'),
 ('application_type', 'loan_status'),
 ('application_type', 'purpose'),
 ('application_type', 'sub_grade'),
 ('application_type', 'term'),
 ('application_type', 'title'),
 ('application_type', 'verification_status'),
 ('application_type', 'zip_code'),
 ('emp_length', 'application_type'),
 ('emp_length', 'emp_length'),
 ('emp_length', 'emp_title'),
 ('emp_length', 'grade'),
 ('emp_length', 'home_ownership'),
 ('emp_length', 'initial_list_status'),
 ('emp_length', 'loan_status'),
 ('emp_length', 'purpose'),
 ('emp_length', 'sub_grade'),
 ('emp_length', 'term'),
 ('emp_length', 'title'),
 ('emp_length', 'verification_status'),
 ('emp_length', 'zip_code'),
 ('emp_title', 'application_type'),
 ('emp_title', 'emp_length'),
 ('emp_title', 'emp_title'),
 ('emp_title', 'grade'),
 ('emp_title', 'home_ownership'),
 ('emp_title', 'initial_list_status'),
 ('emp_title', 'loan_status'),
 ('emp_title', 'purpose'),
 ('emp_title', 'sub_grade'),
 ('emp_title', 'term'),
 ('emp_title', 'verification_status'),
 ('emp_title', 'zip_code'),
 ('grade', 'application_type'),
 ('grade', 'emp_length'),
 ('grade', 'emp_title'),
 ('grade', 'grade'),
 ('grade', 'home_ownership'),
 ('grade', 'initial_list_status'),
 ('grade', 'loan_status'),
 ('grade', 'purpose'),
 ('grade', 'sub_grade'),
 ('grade', 'term'),
 ('grade', 'title'),
 ('grade', 'verification_status'),
 ('grade', 'zip_code'),
 ('home_ownership', 'application_type'),
 ('home_ownership', 'emp_length'),
 ('home_ownership', 'emp_title'),
 ('home_ownership', 'grade'),
 ('home_ownership', 'home_ownership'),
 ('home_ownership', 'initial_list_status'),
 ('home_ownership', 'loan_status'),
 ('home_ownership', 'purpose'),
 ('home_ownership', 'sub_grade'),
 ('home_ownership', 'term'),
 ('home_ownership', 'title'),
 ('home_ownership', 'verification_status'),
 ('home_ownership', 'zip_code'),
 ('initial_list_status', 'application_type'),
 ('initial_list_status', 'emp_length'),
 ('initial_list_status', 'emp_title'),
 ('initial_list_status', 'grade'),
 ('initial_list_status', 'home_ownership'),
 ('initial_list_status', 'initial_list_status'),
 ('initial_list_status', 'loan_status'),
 ('initial_list_status', 'purpose'),
 ('initial_list_status', 'sub_grade'),
 ('initial_list_status', 'term'),
 ('initial_list_status', 'title'),
 ('initial_list_status', 'verification_status'),
 ('initial_list_status', 'zip_code'),
 ('loan_status', 'application_type'),
 ('loan_status', 'emp_length'),
 ('loan_status', 'emp_title'),
 ('loan_status', 'grade'),
 ('loan_status', 'home_ownership'),
 ('loan_status', 'initial_list_status'),
 ('loan_status', 'loan_status'),
 ('loan_status', 'purpose'),
 ('loan_status', 'sub_grade'),
 ('loan_status', 'term'),
 ('loan_status', 'title'),
 ('loan_status', 'verification_status'),
 ('loan_status', 'zip_code'),
 ('purpose', 'application_type'),
 ('purpose', 'emp_length'),
 ('purpose', 'emp_title'),
 ('purpose', 'grade'),
 ('purpose', 'home_ownership'),
 ('purpose', 'initial_list_status'),
 ('purpose', 'loan_status'),
 ('purpose', 'purpose'),
 ('purpose', 'sub_grade'),
 ('purpose', 'term'),
 ('purpose', 'title'),
 ('purpose', 'verification_status'),
 ('purpose', 'zip_code'),
 ('sub_grade', 'application_type'),
 ('sub_grade', 'emp_length'),
 ('sub_grade', 'emp_title'),
 ('sub_grade', 'grade'),
 ('sub_grade', 'home_ownership'),
 ('sub_grade', 'initial_list_status'),
 ('sub_grade', 'loan_status'),
 ('sub_grade', 'purpose'),
 ('sub_grade', 'sub_grade'),
 ('sub_grade', 'term'),
 ('sub_grade', 'title'),
 ('sub_grade', 'verification_status'),
 ('sub_grade', 'zip_code'),
 ('term', 'application_type'),
 ('term', 'emp_length'),
 ('term', 'emp_title'),
 ('term', 'grade'),
 ('term', 'home_ownership'),
 ('term', 'initial_list_status'),
 ('term', 'loan_status'),
 ('term', 'purpose'),
 ('term', 'sub_grade'),
 ('term', 'term'),
 ('term', 'title'),
 ('term', 'verification_status'),
 ('term', 'zip_code'),
 ('title', 'application_type'),
 ('title', 'emp_length'),
 ('title', 'grade'),
 ('title', 'home_ownership'),
 ('title', 'initial_list_status'),
 ('title', 'loan_status'),
 ('title', 'purpose'),
 ('title', 'sub_grade'),
 ('title', 'term'),
 ('title', 'title'),
 ('title', 'verification_status'),
 ('title', 'zip_code'),
 ('verification_status', 'application_type'),
 ('verification_status', 'emp_length'),
 ('verification_status', 'emp_title'),
 ('verification_status', 'grade'),
 ('verification_status', 'home_ownership'),
 ('verification_status', 'initial_list_status'),
 ('verification_status', 'loan_status'),
 ('verification_status', 'purpose'),
 ('verification_status', 'sub_grade'),
 ('verification_status', 'term'),
 ('verification_status', 'title'),
 ('verification_status', 'verification_status'),
 ('verification_status', 'zip_code'),
 ('zip_code', 'application_type'),
 ('zip_code', 'emp_length'),
 ('zip_code', 'emp_title'),
 ('zip_code', 'grade'),
 ('zip_code', 'home_ownership'),
 ('zip_code', 'initial_list_status'),
 ('zip_code', 'loan_status'),
 ('zip_code', 'purpose'),
 ('zip_code', 'sub_grade'),
 ('zip_code', 'term'),
 ('zip_code', 'title'),
 ('zip_code', 'verification_status'),
 ('zip_code', 'zip_code')}
In [79]:
# def cramers_v(x, y):
#     confusion_matrix = pd.crosstab(x,y)
#     chi2 = ss.chi2_contingency(confusion_matrix)[0]
#     n = confusion_matrix.sum().sum()
#     phi2 = chi2/n
#     r,k = confusion_matrix.shape
#     phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
#     rcorr = r-((r-1)**2)/(n-1)
#     kcorr = k-((k-1)**2)/(n-1)
#     return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

# result = []

# for i in cat_combi_1:
#     if i[0] != i[1]:
#         print(i[0], i[1])
#         result.append((i[0],i[1], cramers_v(train_df4[i[0]], train_df4[i[1]])))
#     else:
#         pass
In [80]:
# chitest = pd.DataFrame(result, columns = ['var1', 'var2', 'coeff'] )
# chitest_pvt = chitest.pivot(index='var1', columns='var2', values='coeff').round(3)
# fig = px.imshow(chitest_pvt, color_continuous_scale = 'plasma', text_auto = True, aspect = 'auto')
# fig.show()

From the heat map, we can deduce strong positive correlation (>=0.5) between different variables:

  • grade ↔ subgrade
  • title ↔ purpose

There is an absence of strong negative correlation between variables

🔄 Feature Engineering¶

In [111]:
train_df4_0 = train_df4[train_df4['loan_status']=='0'].sample(frac = 0.25, ignore_index = True)
train_df4_1 = train_df4[train_df4['loan_status']=='1']
train_df5 = pd.concat([train_df4_0, train_df4_1], axis = 0, ignore_index = True)
len(train_df5)
Out[111]:
1028864

3b. Treating of Categorical Features¶

In [112]:
train_df5['term'] = train_df5['term'].str.strip().str[:2]
train_df5['term'].unique()
Out[112]:
array(['36', '60'], dtype=object)

Info about US Postal Code:

  • 1st Digit: Group of states or a specific region within a state
  • 2nd Digit: Narrows down the area within the state or region
  • 3rd Digit Further refines the area, typically specifying a city or a group of nearby cities.
  • 4th digit indicates a specific geographic segment within the city or region.
  • 5th digit, also known as the ZIP code suffix, represents a specific post office or delivery area

To reduce the cardinality of zip code column, so that the com has enough RAM to run a smaller dataset after one-hot encoding, reduce it to 1st two digits

In [113]:
train_df5['zip_code'] = train_df5['zip_code'].str[:2]
train_df5['zip_code'].unique()
Out[113]:
array(['02', '78', '63', '95', '12', '11', '35', '33', '72', '80', '44',
       '76', '92', '10', '67', '37', '23', '29', '91', '15', '97', '08',
       '49', '20', '53', '94', '21', '22', '60', '85', '03', '05', '83',
       '30', '93', '55', '57', '24', '32', '99', '90', '34', '77', '79',
       '18', '48', '81', '68', '19', '54', '17', '87', '96', '27', '89',
       '75', '01', '61', '28', '46', '98', '26', '36', '43', '07', '64',
       '41', '59', '38', '66', '45', '70', '06', '62', '40', '73', '14',
       '74', '47', '84', '31', '42', '04', '13', '39', '86', '65', '82',
       '88', '71', '58', '16', '25', '56', '69', '52', '09', '50', '00',
       nan, '51'], dtype=object)

iii) Date Extraction¶

In [114]:
train_df5['Year_cr_line'] = pd.to_datetime(train_df5['earliest_cr_line'], format = '%b-%Y').dt.year.astype(str)
train_df5['Year_cr_line']
Out[114]:
0          2000
1          1999
2          2000
3          2007
4          2001
           ... 
1028859    2002
1028860    2005
1028861    1995
1028862    1993
1028863    1997
Name: Year_cr_line, Length: 1028864, dtype: object

There seems to be missing data for the earlier parts of the years for both issue_d and earliest_cr_line, hence to drop these columns for incomplete covereage

i) Label Encoding for Ordinal Features¶

In [115]:
train_df6 = train_df5.sort_values(['grade'])
train_df6['grade'].unique()

labelencoder = LabelEncoder()

train_df6['grade'] = labelencoder.fit_transform(train_df6['grade'])
train_df6
Out[115]:
loan_amnt term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status loan_status purpose title zip_code dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status application_type mort_acc pub_rec_bankruptcies Year_cr_line
481659 27650.0 36 8.19 868.88 0 A4 Audit Consultant 5 years OWN 90000.0 Source Verified 0 debt_consolidation Debt consolidation 21 29.08 1997-04-01 12.0 0.0 14096.0 31.6 24.0 w Individual 1.0 0.0 1997
369318 9000.0 36 10.81 293.84 0 A5 Facility Manager 4 years RENT 53000.0 Not Verified 0 debt_consolidation Debt consolidation 23 26.86 2013-01-01 29.0 0.0 18990.0 34.7 38.0 w Individual 0.0 0.0 2013
173152 24000.0 60 8.19 488.82 0 A4 DSR 10+ years MORTGAGE 94000.0 Source Verified 0 debt_consolidation Debt consolidation 71 23.38 1993-12-01 11.0 0.0 21936.0 28.5 24.0 w Joint App 3.0 0.0 1993
173153 12500.0 36 6.49 383.06 0 A2 Marketing Manager 5 years MORTGAGE 69500.0 Not Verified 0 debt_consolidation Debt consolidation 33 14.50 2005-01-01 14.0 0.0 14163.0 32.5 28.0 w Individual 1.0 0.0 2005
902087 5600.0 36 7.89 175.20 0 A5 Legal Assistant 9 years MORTGAGE 55000.0 Not Verified 1 major_purchase Major purchase 90 13.11 1996-06-01 5.0 0.0 4771.0 71.2 21.0 w Individual 2.0 0.0 1996
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
418365 11525.0 60 28.99 365.76 6 G5 Document control < 1 year RENT 28800.0 Source Verified 0 debt_consolidation Debt consolidation 91 40.08 2007-09-01 14.0 0.0 14701.0 53.7 17.0 f Individual 0.0 0.0 2007
802582 30000.0 60 25.80 894.67 6 G1 Director of clinic operations 2 years MORTGAGE 120000.0 Verified 1 debt_consolidation Debt consolidation 14 16.12 2001-09-01 14.0 0.0 18020.0 49.6 46.0 f Individual 3.0 0.0 2001
130405 35000.0 60 30.79 1149.42 6 G1 operator 10+ years MORTGAGE 80000.0 Verified 0 debt_consolidation Debt consolidation 75 17.48 2006-09-01 9.0 0.0 950.0 25.0 22.0 w Individual 5.0 0.0 2006
861165 21000.0 60 26.77 638.36 6 G1 DIRECTOR OF OPERATIONS 2 years RENT 105000.0 Verified 1 vacation Vacation 91 5.03 2008-03-01 13.0 0.0 3053.0 38.6 16.0 f Individual 0.0 0.0 2008
409828 30000.0 60 25.80 894.67 6 G1 Data Systems Project Director 3 years RENT 92851.2 Verified 0 debt_consolidation Debt consolidation 85 12.73 2002-10-01 6.0 0.0 21961.0 88.2 13.0 w Individual 4.0 0.0 2002

1028864 rows × 27 columns

v) Dropping of Unnecessary Categorical Features¶

  • sub_grade is the subset of grade, hence grade to be kept and discard sub_grade which may cause multicollinearity
  • emp_title, title to be removed due to high cardinality
  • emp_length, initial_list_status to be removed due to similar distribution of loan status across different employment length
  • `
In [116]:
pseudo_df = train_df6.drop([ 'sub_grade', 'emp_title', 'emp_length', 'initial_list_status', 'title', 'earliest_cr_line'], axis =1)
pseudo_df.select_dtypes('object').columns
Out[116]:
Index(['term', 'home_ownership', 'verification_status', 'loan_status',
       'purpose', 'zip_code', 'application_type', 'Year_cr_line'],
      dtype='object')

vi) One Hot Encoding¶

In [117]:
dummies = pseudo_df.select_dtypes('object').columns.tolist()
dummies.remove('loan_status') # have to do on separate row, cant put concurrently abv. remove mutates the list in-place
dummies
Out[117]:
['term',
 'home_ownership',
 'verification_status',
 'purpose',
 'zip_code',
 'application_type',
 'Year_cr_line']
In [118]:
train_df7 = train_df6.copy()
train_df7['verification_status'] = np.where(train_df7['verification_status'] == 'Source Verified', 'Verified', train_df7['verification_status'])
train_df7['home_ownership'].replace(['NONE', 'ANY'], 'OTHER', inplace = True)

train_df7['home_ownership'] = train_df7['home_ownership'].str.title()
train_df7['application_type'] = train_df7['application_type'].str.title()

for i in dummies:
    train_df7[i] = train_df7[i].str.title()
    print(train_df7[i].unique())
['36' '60']
['Own' 'Rent' 'Mortgage' 'Other']
['Verified' 'Not Verified']
['Debt_Consolidation' 'Major_Purchase' 'Credit_Card' 'Other'
 'Home_Improvement' 'Moving' 'Car' 'Vacation' 'Medical' 'House'
 'Small_Business' 'Renewable_Energy' 'Wedding' 'Educational']
['21' '23' '71' '33' '90' '76' '34' '11' '30' '32' '06' '44' '14' '40'
 '59' '72' '95' '02' '94' '75' '49' '73' '80' '93' '18' '48' '42' '55'
 '53' '62' '27' '85' '04' '43' '70' '10' '89' '08' '20' '47' '28' '46'
 '99' '01' '60' '79' '38' '37' '54' '64' '61' '92' '98' '67' '91' '03'
 '35' '77' '24' '07' '97' '17' '13' '29' '84' '96' '78' '19' '15' '66'
 '31' '86' '22' '36' '45' '41' '63' '58' '05' '82' '25' '81' '68' '56'
 '26' '65' '16' '57' '87' '12' '39' '74' '69' '83' '88' '09' '50' '00'
 '52' '51' nan]
['Individual' 'Joint App']
['1997' '2013' '1993' '2005' '1996' '2001' '1991' '1998' '1976' '1999'
 '2007' '2002' '1968' '1984' '1995' '2004' '1994' '2000' '2012' '1987'
 '2003' '2006' '2008' '1986' '2011' '2014' '1983' '1990' '1992' '1985'
 '1978' '2010' '1979' '1982' '1989' '1981' '2009' '2015' '1988' '1967'
 '1977' '1975' '2016' '1971' '1980' '1972' '1962' '1973' '1963' '1970'
 '1974' '1969' '1965' '1961' '1952' '1966' '1959' '1964' '2017' '1960'
 '1956' '1958' '1955' '1957' '1944' '1953' '1948' '1950' '1951' '1954'
 '1946' '1949' '1934']
In [119]:
train_df8 = pd.get_dummies(train_df7, columns = dummies, drop_first = True)
# train_df4.drop(dummies, axis  = 1, inplace = True)
train_df8
Out[119]:
loan_amnt int_rate installment grade sub_grade emp_title emp_length annual_inc loan_status title dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status mort_acc pub_rec_bankruptcies term_60 home_ownership_Other home_ownership_Own home_ownership_Rent verification_status_Verified purpose_Credit_Card purpose_Debt_Consolidation purpose_Educational purpose_Home_Improvement purpose_House purpose_Major_Purchase purpose_Medical purpose_Moving purpose_Other purpose_Renewable_Energy purpose_Small_Business purpose_Vacation purpose_Wedding zip_code_01 zip_code_02 zip_code_03 zip_code_04 zip_code_05 zip_code_06 zip_code_07 zip_code_08 zip_code_09 zip_code_10 zip_code_11 zip_code_12 zip_code_13 zip_code_14 zip_code_15 zip_code_16 zip_code_17 zip_code_18 zip_code_19 zip_code_20 zip_code_21 zip_code_22 zip_code_23 zip_code_24 zip_code_25 zip_code_26 zip_code_27 zip_code_28 zip_code_29 zip_code_30 zip_code_31 zip_code_32 zip_code_33 zip_code_34 zip_code_35 zip_code_36 zip_code_37 zip_code_38 zip_code_39 zip_code_40 zip_code_41 zip_code_42 zip_code_43 zip_code_44 zip_code_45 zip_code_46 zip_code_47 zip_code_48 zip_code_49 zip_code_50 zip_code_51 zip_code_52 zip_code_53 zip_code_54 zip_code_55 zip_code_56 zip_code_57 zip_code_58 zip_code_59 zip_code_60 zip_code_61 zip_code_62 zip_code_63 zip_code_64 zip_code_65 zip_code_66 zip_code_67 zip_code_68 zip_code_69 zip_code_70 zip_code_71 zip_code_72 zip_code_73 zip_code_74 zip_code_75 zip_code_76 zip_code_77 zip_code_78 zip_code_79 zip_code_80 zip_code_81 zip_code_82 zip_code_83 zip_code_84 zip_code_85 zip_code_86 zip_code_87 zip_code_88 zip_code_89 zip_code_90 zip_code_91 zip_code_92 zip_code_93 zip_code_94 zip_code_95 zip_code_96 zip_code_97 zip_code_98 zip_code_99 application_type_Joint App Year_cr_line_1944 Year_cr_line_1946 Year_cr_line_1948 Year_cr_line_1949 Year_cr_line_1950 Year_cr_line_1951 Year_cr_line_1952 Year_cr_line_1953 Year_cr_line_1954 Year_cr_line_1955 Year_cr_line_1956 Year_cr_line_1957 Year_cr_line_1958 Year_cr_line_1959 Year_cr_line_1960 Year_cr_line_1961 Year_cr_line_1962 Year_cr_line_1963 Year_cr_line_1964 Year_cr_line_1965 Year_cr_line_1966 Year_cr_line_1967 Year_cr_line_1968 Year_cr_line_1969 Year_cr_line_1970 Year_cr_line_1971 Year_cr_line_1972 Year_cr_line_1973 Year_cr_line_1974 Year_cr_line_1975 Year_cr_line_1976 Year_cr_line_1977 Year_cr_line_1978 Year_cr_line_1979 Year_cr_line_1980 Year_cr_line_1981 Year_cr_line_1982 Year_cr_line_1983 Year_cr_line_1984 Year_cr_line_1985 Year_cr_line_1986 Year_cr_line_1987 Year_cr_line_1988 Year_cr_line_1989 Year_cr_line_1990 Year_cr_line_1991 Year_cr_line_1992 Year_cr_line_1993 Year_cr_line_1994 Year_cr_line_1995 Year_cr_line_1996 Year_cr_line_1997 Year_cr_line_1998 Year_cr_line_1999 Year_cr_line_2000 Year_cr_line_2001 Year_cr_line_2002 Year_cr_line_2003 Year_cr_line_2004 Year_cr_line_2005 Year_cr_line_2006 Year_cr_line_2007 Year_cr_line_2008 Year_cr_line_2009 Year_cr_line_2010 Year_cr_line_2011 Year_cr_line_2012 Year_cr_line_2013 Year_cr_line_2014 Year_cr_line_2015 Year_cr_line_2016 Year_cr_line_2017
481659 27650.0 8.19 868.88 0 A4 Audit Consultant 5 years 90000.0 0 Debt consolidation 29.08 1997-04-01 12.0 0.0 14096.0 31.6 24.0 w 1.0 0.0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
369318 9000.0 10.81 293.84 0 A5 Facility Manager 4 years 53000.0 0 Debt consolidation 26.86 2013-01-01 29.0 0.0 18990.0 34.7 38.0 w 0.0 0.0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
173152 24000.0 8.19 488.82 0 A4 DSR 10+ years 94000.0 0 Debt consolidation 23.38 1993-12-01 11.0 0.0 21936.0 28.5 24.0 w 3.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
173153 12500.0 6.49 383.06 0 A2 Marketing Manager 5 years 69500.0 0 Debt consolidation 14.50 2005-01-01 14.0 0.0 14163.0 32.5 28.0 w 1.0 0.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
902087 5600.0 7.89 175.20 0 A5 Legal Assistant 9 years 55000.0 1 Major purchase 13.11 1996-06-01 5.0 0.0 4771.0 71.2 21.0 w 2.0 0.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
418365 11525.0 28.99 365.76 6 G5 Document control < 1 year 28800.0 0 Debt consolidation 40.08 2007-09-01 14.0 0.0 14701.0 53.7 17.0 f 0.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
802582 30000.0 25.80 894.67 6 G1 Director of clinic operations 2 years 120000.0 1 Debt consolidation 16.12 2001-09-01 14.0 0.0 18020.0 49.6 46.0 f 3.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
130405 35000.0 30.79 1149.42 6 G1 operator 10+ years 80000.0 0 Debt consolidation 17.48 2006-09-01 9.0 0.0 950.0 25.0 22.0 w 5.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
861165 21000.0 26.77 638.36 6 G1 DIRECTOR OF OPERATIONS 2 years 105000.0 1 Vacation 5.03 2008-03-01 13.0 0.0 3053.0 38.6 16.0 f 0.0 0.0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
409828 30000.0 25.80 894.67 6 G1 Data Systems Project Director 3 years 92851.2 0 Debt consolidation 12.73 2002-10-01 6.0 0.0 21961.0 88.2 13.0 w 4.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

1028864 rows × 210 columns

3c. Treating of Numeric Features¶

In [120]:
train_df8.select_dtypes(['int64','float64']).columns
Out[120]:
Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc',
       'pub_rec_bankruptcies'],
      dtype='object')

annual_inc¶

As mentioned in Section 2a, not logical that borrower could borrow money with annual income = 0. Hence, to impute using median

Additionally, the annual income is extremely right-skewed with a wide range, hence to execute log transform

In [121]:
train_df8[train_df8['annual_inc']==0]
Out[121]:
loan_amnt int_rate installment grade sub_grade emp_title emp_length annual_inc loan_status title dti earliest_cr_line open_acc pub_rec revol_bal revol_util total_acc initial_list_status mort_acc pub_rec_bankruptcies term_60 home_ownership_Other home_ownership_Own home_ownership_Rent verification_status_Verified purpose_Credit_Card purpose_Debt_Consolidation purpose_Educational purpose_Home_Improvement purpose_House purpose_Major_Purchase purpose_Medical purpose_Moving purpose_Other purpose_Renewable_Energy purpose_Small_Business purpose_Vacation purpose_Wedding zip_code_01 zip_code_02 zip_code_03 zip_code_04 zip_code_05 zip_code_06 zip_code_07 zip_code_08 zip_code_09 zip_code_10 zip_code_11 zip_code_12 zip_code_13 zip_code_14 zip_code_15 zip_code_16 zip_code_17 zip_code_18 zip_code_19 zip_code_20 zip_code_21 zip_code_22 zip_code_23 zip_code_24 zip_code_25 zip_code_26 zip_code_27 zip_code_28 zip_code_29 zip_code_30 zip_code_31 zip_code_32 zip_code_33 zip_code_34 zip_code_35 zip_code_36 zip_code_37 zip_code_38 zip_code_39 zip_code_40 zip_code_41 zip_code_42 zip_code_43 zip_code_44 zip_code_45 zip_code_46 zip_code_47 zip_code_48 zip_code_49 zip_code_50 zip_code_51 zip_code_52 zip_code_53 zip_code_54 zip_code_55 zip_code_56 zip_code_57 zip_code_58 zip_code_59 zip_code_60 zip_code_61 zip_code_62 zip_code_63 zip_code_64 zip_code_65 zip_code_66 zip_code_67 zip_code_68 zip_code_69 zip_code_70 zip_code_71 zip_code_72 zip_code_73 zip_code_74 zip_code_75 zip_code_76 zip_code_77 zip_code_78 zip_code_79 zip_code_80 zip_code_81 zip_code_82 zip_code_83 zip_code_84 zip_code_85 zip_code_86 zip_code_87 zip_code_88 zip_code_89 zip_code_90 zip_code_91 zip_code_92 zip_code_93 zip_code_94 zip_code_95 zip_code_96 zip_code_97 zip_code_98 zip_code_99 application_type_Joint App Year_cr_line_1944 Year_cr_line_1946 Year_cr_line_1948 Year_cr_line_1949 Year_cr_line_1950 Year_cr_line_1951 Year_cr_line_1952 Year_cr_line_1953 Year_cr_line_1954 Year_cr_line_1955 Year_cr_line_1956 Year_cr_line_1957 Year_cr_line_1958 Year_cr_line_1959 Year_cr_line_1960 Year_cr_line_1961 Year_cr_line_1962 Year_cr_line_1963 Year_cr_line_1964 Year_cr_line_1965 Year_cr_line_1966 Year_cr_line_1967 Year_cr_line_1968 Year_cr_line_1969 Year_cr_line_1970 Year_cr_line_1971 Year_cr_line_1972 Year_cr_line_1973 Year_cr_line_1974 Year_cr_line_1975 Year_cr_line_1976 Year_cr_line_1977 Year_cr_line_1978 Year_cr_line_1979 Year_cr_line_1980 Year_cr_line_1981 Year_cr_line_1982 Year_cr_line_1983 Year_cr_line_1984 Year_cr_line_1985 Year_cr_line_1986 Year_cr_line_1987 Year_cr_line_1988 Year_cr_line_1989 Year_cr_line_1990 Year_cr_line_1991 Year_cr_line_1992 Year_cr_line_1993 Year_cr_line_1994 Year_cr_line_1995 Year_cr_line_1996 Year_cr_line_1997 Year_cr_line_1998 Year_cr_line_1999 Year_cr_line_2000 Year_cr_line_2001 Year_cr_line_2002 Year_cr_line_2003 Year_cr_line_2004 Year_cr_line_2005 Year_cr_line_2006 Year_cr_line_2007 Year_cr_line_2008 Year_cr_line_2009 Year_cr_line_2010 Year_cr_line_2011 Year_cr_line_2012 Year_cr_line_2013 Year_cr_line_2014 Year_cr_line_2015 Year_cr_line_2016 Year_cr_line_2017
507749 7000.0 6.71 215.22 0 A3 NaN NaN 0.0 0 Medical expenses 19.29648 1987-12-01 12.0 1.0 5451.0 16.1 26.0 w 0.0 1.0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
368291 7300.0 6.11 222.45 0 A1 NaN NaN 0.0 0 Credit card refinancing 19.29648 2007-04-01 11.0 0.0 6608.0 13.2 11.0 w 0.0 0.0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
173656 23000.0 6.46 704.51 0 A1 NaN NaN 0.0 0 Debt consolidation 19.29648 2014-06-01 3.0 0.0 16274.0 51.7 4.0 w 0.0 0.0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
623024 20000.0 7.21 619.47 0 A3 NaN NaN 0.0 0 Debt consolidation 19.29648 2004-09-01 6.0 1.0 400.0 2.8 12.0 w 1.0 1.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
179870 22200.0 6.46 680.01 0 A1 NaN NaN 0.0 0 Debt consolidation 19.29648 2006-07-01 6.0 0.0 4479.0 18.1 12.0 w 0.0 0.0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
739220 24850.0 30.79 816.09 6 G1 NaN NaN 0.0 1 Debt consolidation 19.29648 2006-12-01 3.0 0.0 8134.0 90.4 12.0 w 0.0 0.0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
345029 35000.0 30.79 1149.42 6 G1 NaN NaN 0.0 0 Debt consolidation 19.29648 2007-10-01 5.0 0.0 11878.0 55.2 8.0 w 0.0 0.0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
703101 10000.0 30.79 328.41 6 G1 NaN NaN 0.0 1 Business 19.29648 2007-05-01 8.0 0.0 4894.0 99.9 13.0 w 2.0 0.0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
226315 10000.0 30.79 328.41 6 G1 NaN NaN 0.0 0 Other 19.29648 2014-11-01 6.0 0.0 2398.0 28.5 9.0 w 0.0 0.0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
681917 30000.0 30.84 986.14 6 G2 NaN NaN 0.0 1 Debt consolidation 19.29648 2002-10-01 4.0 0.0 4071.0 33.9 15.0 w 1.0 0.0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

967 rows × 210 columns

In [122]:
train_df8['annual_inc'].replace(0, train_df8['annual_inc'].median(), inplace = True)
if len(train_df8[train_df8['annual_inc']==0]) == 0:
    print("Imputation Successful")
else:
    raise Exception("Imputation Unsucessful")
Imputation Successful
In [123]:
train_df8['annual_inc_log'] = (train_df8['annual_inc']).transform(np.log) # no need to +1 as 0 has been imputed
train_df8[['annual_inc_log', 'annual_inc']]
Out[123]:
annual_inc_log annual_inc
481659 11.407565 90000.0
369318 10.878047 53000.0
173152 11.451050 94000.0
173153 11.149082 69500.0
902087 10.915088 55000.0
... ... ...
418365 10.268131 28800.0
802582 11.695247 120000.0
130405 11.289782 80000.0
861165 11.561716 105000.0
409828 11.438753 92851.2

1028864 rows × 2 columns

total_acc & open_acc¶

total_acc is highly correlated with open_acc as shown in the heatmap under Section 2c. Hence, one of the columns has to be removed to minimise error arising from multicollinearity. As total_acc distribution is more than than open_acc, open_acc will be dropped.

In [124]:
train_df8.drop_duplicates(['total_acc', 'open_acc'])[['total_acc', 'open_acc']].sort_values(['total_acc', 'open_acc']).head(10)
Out[124]:
total_acc open_acc
639629 1.0 1.0
343086 2.0 0.0
180202 2.0 1.0
173328 2.0 2.0
94050 3.0 0.0
184014 3.0 1.0
173611 3.0 2.0
173192 3.0 3.0
640516 3.0 4.0
253214 4.0 0.0

pub_rec_bankruptcies & pub_rec¶

Similar to above, pub_rec_bankruptcies and pub_rec are highly correlated and similar distribution as mentioned in section 2d, pub_rec will be selected as derogatory records can be an early tell-tale signs to determine whether borrower will default

In [125]:
train_df8.drop_duplicates(['pub_rec_bankruptcies', 'pub_rec'])[['pub_rec', 'pub_rec_bankruptcies']].sort_values(['pub_rec', 'pub_rec_bankruptcies']).head(10)
Out[125]:
pub_rec pub_rec_bankruptcies
481659 0.0 0.0
369405 1.0 0.0
173111 1.0 1.0
369826 2.0 0.0
170496 2.0 1.0
172988 2.0 2.0
371645 3.0 0.0
903320 3.0 1.0
374435 3.0 2.0
176749 3.0 3.0

loan_amt & installment¶

To drop installment, for the same reason as above

In [126]:
train_df8.drop_duplicates(['loan_amnt', 'installment'])[['loan_amnt', 'installment']].sort_values(['loan_amnt', 'installment']).head(10)
Out[126]:
loan_amnt installment
639908 500.0 15.67
639788 500.0 15.75
640501 500.0 15.76
640446 500.0 15.91
640404 500.0 16.73
640220 500.0 16.85
640150 550.0 18.28
640281 600.0 19.62
640367 600.0 19.80
639906 600.0 20.29

dti¶

Similar to annual_inc, does not make sense to have 0 for dti.

As the dti data is skewed with range from 0.01 (ignoring 0) to 9999 (which may be an error), these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 99.3%

In [127]:
(train_df8['dti']==0).sum()
Out[127]:
845
In [128]:
(train_df8['dti']==9999).sum()
Out[128]:
0
In [129]:
lower_lim =train_df8['dti'].quantile(0.01)
upper_lim =train_df8['dti'].quantile(0.99)


train_df9 = train_df8[(train_df8['dti'] < upper_lim) & (train_df8['dti'] > lower_lim)]
print("max:", np.max(train_df8['dti']), "\nmin:", np.min(train_df8['dti']))
print('Number of Records Removed:', (len(train_df8)-len(train_df9)))
max: 999.0 
min: -1.0
Number of Records Removed: 20613

revol_bal¶

Similar to annual_inc, does not make sense to have 0 for revol_bal.

As the revol_bal data is skewed with range from 1 (ignoring 0) to 1298783, these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 99.5%

In [130]:
lower_lim =train_df9['revol_bal'].quantile(0.01)
upper_lim =train_df9['revol_bal'].quantile(0.99)


train_df10 = train_df8[(train_df9['revol_bal'] < upper_lim) & (train_df8['revol_bal'] > lower_lim)]
print("max:", np.max(train_df10['revol_bal']), "\nmin:", np.min(train_df10['revol_bal']))
print('Number of Records Removed:', (len(train_df9)-len(train_df10)))
max: 95203.0 
min: 165.0
Number of Records Removed: 20167

revol_util¶

Similar to annual_inc, does not make sense to have 0 for revol_util.

As the revol_bal data is skewed with range from 0.01 (ignoring 0) to 892.3, these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 97.8%

In [131]:
lower_lim =train_df10['revol_util'].quantile(0.01)
upper_lim =train_df10['revol_util'].quantile(0.99)


train_df11 = train_df10[(train_df10['revol_util'] < upper_lim) & (train_df10['revol_util'] > lower_lim)]
print("max:", np.max(train_df11['revol_util']), "\nmin:", np.min(train_df11['revol_util']))
print('Number of Records Removed:', (len(train_df8)-len(train_df9)))
max: 98.1 
min: 3.5
Number of Records Removed: 20613
In [132]:
train_df12 = train_df11.drop([ 'sub_grade', 'emp_title', 'emp_length', 'initial_list_status', 'title', 'earliest_cr_line'], axis =1)
train_df12
Out[132]:
loan_amnt int_rate installment grade annual_inc loan_status dti open_acc pub_rec revol_bal revol_util total_acc mort_acc pub_rec_bankruptcies term_60 home_ownership_Other home_ownership_Own home_ownership_Rent verification_status_Verified purpose_Credit_Card purpose_Debt_Consolidation purpose_Educational purpose_Home_Improvement purpose_House purpose_Major_Purchase purpose_Medical purpose_Moving purpose_Other purpose_Renewable_Energy purpose_Small_Business purpose_Vacation purpose_Wedding zip_code_01 zip_code_02 zip_code_03 zip_code_04 zip_code_05 zip_code_06 zip_code_07 zip_code_08 zip_code_09 zip_code_10 zip_code_11 zip_code_12 zip_code_13 zip_code_14 zip_code_15 zip_code_16 zip_code_17 zip_code_18 zip_code_19 zip_code_20 zip_code_21 zip_code_22 zip_code_23 zip_code_24 zip_code_25 zip_code_26 zip_code_27 zip_code_28 zip_code_29 zip_code_30 zip_code_31 zip_code_32 zip_code_33 zip_code_34 zip_code_35 zip_code_36 zip_code_37 zip_code_38 zip_code_39 zip_code_40 zip_code_41 zip_code_42 zip_code_43 zip_code_44 zip_code_45 zip_code_46 zip_code_47 zip_code_48 zip_code_49 zip_code_50 zip_code_51 zip_code_52 zip_code_53 zip_code_54 zip_code_55 zip_code_56 zip_code_57 zip_code_58 zip_code_59 zip_code_60 zip_code_61 zip_code_62 zip_code_63 zip_code_64 zip_code_65 zip_code_66 zip_code_67 zip_code_68 zip_code_69 zip_code_70 zip_code_71 zip_code_72 zip_code_73 zip_code_74 zip_code_75 zip_code_76 zip_code_77 zip_code_78 zip_code_79 zip_code_80 zip_code_81 zip_code_82 zip_code_83 zip_code_84 zip_code_85 zip_code_86 zip_code_87 zip_code_88 zip_code_89 zip_code_90 zip_code_91 zip_code_92 zip_code_93 zip_code_94 zip_code_95 zip_code_96 zip_code_97 zip_code_98 zip_code_99 application_type_Joint App Year_cr_line_1944 Year_cr_line_1946 Year_cr_line_1948 Year_cr_line_1949 Year_cr_line_1950 Year_cr_line_1951 Year_cr_line_1952 Year_cr_line_1953 Year_cr_line_1954 Year_cr_line_1955 Year_cr_line_1956 Year_cr_line_1957 Year_cr_line_1958 Year_cr_line_1959 Year_cr_line_1960 Year_cr_line_1961 Year_cr_line_1962 Year_cr_line_1963 Year_cr_line_1964 Year_cr_line_1965 Year_cr_line_1966 Year_cr_line_1967 Year_cr_line_1968 Year_cr_line_1969 Year_cr_line_1970 Year_cr_line_1971 Year_cr_line_1972 Year_cr_line_1973 Year_cr_line_1974 Year_cr_line_1975 Year_cr_line_1976 Year_cr_line_1977 Year_cr_line_1978 Year_cr_line_1979 Year_cr_line_1980 Year_cr_line_1981 Year_cr_line_1982 Year_cr_line_1983 Year_cr_line_1984 Year_cr_line_1985 Year_cr_line_1986 Year_cr_line_1987 Year_cr_line_1988 Year_cr_line_1989 Year_cr_line_1990 Year_cr_line_1991 Year_cr_line_1992 Year_cr_line_1993 Year_cr_line_1994 Year_cr_line_1995 Year_cr_line_1996 Year_cr_line_1997 Year_cr_line_1998 Year_cr_line_1999 Year_cr_line_2000 Year_cr_line_2001 Year_cr_line_2002 Year_cr_line_2003 Year_cr_line_2004 Year_cr_line_2005 Year_cr_line_2006 Year_cr_line_2007 Year_cr_line_2008 Year_cr_line_2009 Year_cr_line_2010 Year_cr_line_2011 Year_cr_line_2012 Year_cr_line_2013 Year_cr_line_2014 Year_cr_line_2015 Year_cr_line_2016 Year_cr_line_2017 annual_inc_log
481659 27650.0 8.19 868.88 0 90000.0 0 29.08 12.0 0.0 14096.0 31.6 24.0 1.0 0.0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.407565
369318 9000.0 10.81 293.84 0 53000.0 0 26.86 29.0 0.0 18990.0 34.7 38.0 0.0 0.0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 10.878047
173152 24000.0 8.19 488.82 0 94000.0 0 23.38 11.0 0.0 21936.0 28.5 24.0 3.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.451050
173153 12500.0 6.49 383.06 0 69500.0 0 14.50 14.0 0.0 14163.0 32.5 28.0 1.0 0.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 11.149082
902087 5600.0 7.89 175.20 0 55000.0 1 13.11 5.0 0.0 4771.0 71.2 21.0 2.0 0.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.915088
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
418365 11525.0 28.99 365.76 6 28800.0 0 40.08 14.0 0.0 14701.0 53.7 17.0 0.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 10.268131
802582 30000.0 25.80 894.67 6 120000.0 1 16.12 14.0 0.0 18020.0 49.6 46.0 3.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.695247
130405 35000.0 30.79 1149.42 6 80000.0 0 17.48 9.0 0.0 950.0 25.0 22.0 5.0 0.0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 11.289782
861165 21000.0 26.77 638.36 6 105000.0 1 5.03 13.0 0.0 3053.0 38.6 16.0 0.0 0.0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 11.561716
409828 30000.0 25.80 894.67 6 92851.2 0 12.73 6.0 0.0 21961.0 88.2 13.0 4.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.438753

967893 rows × 205 columns

4.🤖 Models Building and Comparing Model Perforamance¶


OVERALL GOAL:

  • Building a machine learning model that is capable of generating predictions

Logistic Regression (baseline model w/o hyperparameter tuning)¶

Treat Imbalanced Data

In [133]:
del [train_df, train_df2, train_df3, train_df4, train_df5, train_df6, train_df7, train_df8]
In [134]:
feat_selection = train_df12.columns.tolist()
feat_selection.remove('loan_status')

# from imblearn.over_sampling import SMOTE
# from sklearn.utils import shuffle

# train_df9 = shuffle(train_df9)
# sm = SMOTE(sampling_strategy='minority', random_state= 1)
# X_sm, y_sm =  sm.fit_resample(train_df9[feat_selection], train_df9['loan_status'])
# train_df10 = pd.concat([pd.DataFrame(y_sm), pd.DataFrame(X_sm)], axis = 1)

# from imblearn.under_sampling import NearMiss

# undersample = NearMiss(version = 1, n_neighbors = 3)
# X_us, y_us = undersample.fit_resample(train_df9[feat_selection], train_df9['loan_status'])
# train_df10 = pd.concat([pd.DataFrame(y_us), pd.DataFrame(X_us)], axis = 1)
In [135]:
from sklearn.utils import shuffle
train_df13 = shuffle(train_df12, random_state = 1).reset_index()
train_df13
Out[135]:
index loan_amnt int_rate installment grade annual_inc loan_status dti open_acc pub_rec revol_bal revol_util total_acc mort_acc pub_rec_bankruptcies term_60 home_ownership_Other home_ownership_Own home_ownership_Rent verification_status_Verified purpose_Credit_Card purpose_Debt_Consolidation purpose_Educational purpose_Home_Improvement purpose_House purpose_Major_Purchase purpose_Medical purpose_Moving purpose_Other purpose_Renewable_Energy purpose_Small_Business purpose_Vacation purpose_Wedding zip_code_01 zip_code_02 zip_code_03 zip_code_04 zip_code_05 zip_code_06 zip_code_07 zip_code_08 zip_code_09 zip_code_10 zip_code_11 zip_code_12 zip_code_13 zip_code_14 zip_code_15 zip_code_16 zip_code_17 zip_code_18 zip_code_19 zip_code_20 zip_code_21 zip_code_22 zip_code_23 zip_code_24 zip_code_25 zip_code_26 zip_code_27 zip_code_28 zip_code_29 zip_code_30 zip_code_31 zip_code_32 zip_code_33 zip_code_34 zip_code_35 zip_code_36 zip_code_37 zip_code_38 zip_code_39 zip_code_40 zip_code_41 zip_code_42 zip_code_43 zip_code_44 zip_code_45 zip_code_46 zip_code_47 zip_code_48 zip_code_49 zip_code_50 zip_code_51 zip_code_52 zip_code_53 zip_code_54 zip_code_55 zip_code_56 zip_code_57 zip_code_58 zip_code_59 zip_code_60 zip_code_61 zip_code_62 zip_code_63 zip_code_64 zip_code_65 zip_code_66 zip_code_67 zip_code_68 zip_code_69 zip_code_70 zip_code_71 zip_code_72 zip_code_73 zip_code_74 zip_code_75 zip_code_76 zip_code_77 zip_code_78 zip_code_79 zip_code_80 zip_code_81 zip_code_82 zip_code_83 zip_code_84 zip_code_85 zip_code_86 zip_code_87 zip_code_88 zip_code_89 zip_code_90 zip_code_91 zip_code_92 zip_code_93 zip_code_94 zip_code_95 zip_code_96 zip_code_97 zip_code_98 zip_code_99 application_type_Joint App Year_cr_line_1944 Year_cr_line_1946 Year_cr_line_1948 Year_cr_line_1949 Year_cr_line_1950 Year_cr_line_1951 Year_cr_line_1952 Year_cr_line_1953 Year_cr_line_1954 Year_cr_line_1955 Year_cr_line_1956 Year_cr_line_1957 Year_cr_line_1958 Year_cr_line_1959 Year_cr_line_1960 Year_cr_line_1961 Year_cr_line_1962 Year_cr_line_1963 Year_cr_line_1964 Year_cr_line_1965 Year_cr_line_1966 Year_cr_line_1967 Year_cr_line_1968 Year_cr_line_1969 Year_cr_line_1970 Year_cr_line_1971 Year_cr_line_1972 Year_cr_line_1973 Year_cr_line_1974 Year_cr_line_1975 Year_cr_line_1976 Year_cr_line_1977 Year_cr_line_1978 Year_cr_line_1979 Year_cr_line_1980 Year_cr_line_1981 Year_cr_line_1982 Year_cr_line_1983 Year_cr_line_1984 Year_cr_line_1985 Year_cr_line_1986 Year_cr_line_1987 Year_cr_line_1988 Year_cr_line_1989 Year_cr_line_1990 Year_cr_line_1991 Year_cr_line_1992 Year_cr_line_1993 Year_cr_line_1994 Year_cr_line_1995 Year_cr_line_1996 Year_cr_line_1997 Year_cr_line_1998 Year_cr_line_1999 Year_cr_line_2000 Year_cr_line_2001 Year_cr_line_2002 Year_cr_line_2003 Year_cr_line_2004 Year_cr_line_2005 Year_cr_line_2006 Year_cr_line_2007 Year_cr_line_2008 Year_cr_line_2009 Year_cr_line_2010 Year_cr_line_2011 Year_cr_line_2012 Year_cr_line_2013 Year_cr_line_2014 Year_cr_line_2015 Year_cr_line_2016 Year_cr_line_2017 annual_inc_log
0 708815 2600.0 10.41 84.40 1 40000.00 1 9.39 6.0 4.0 2146.0 52.3 32.0 1.0 1.0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.596635
1 236060 16000.0 14.09 547.55 1 62000.00 0 32.18 5.0 0.0 11896.0 67.2 18.0 0.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.034890
2 292369 15000.0 14.74 354.81 2 38000.00 0 16.84 7.0 0.0 15927.0 58.1 10.0 0.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 10.545341
3 160815 3500.0 16.95 124.70 2 61000.00 0 25.65 10.0 0.0 14464.0 60.0 22.0 3.0 0.0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.018629
4 691072 16000.0 13.59 543.67 2 90000.00 1 16.99 18.0 0.0 15440.0 54.6 27.0 0.0 0.0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.407565
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
967888 1012275 12000.0 14.08 410.60 2 48100.00 1 18.34 9.0 0.0 11940.0 56.3 24.0 1.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.781037
967889 489905 35000.0 16.55 1240.03 3 75000.00 0 20.73 8.0 0.0 5446.0 93.9 20.0 5.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.225243
967890 867007 14000.0 13.33 473.95 2 102000.00 1 22.89 15.0 0.0 24866.0 67.6 40.0 3.0 0.0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.532728
967891 1019304 8950.0 14.08 306.24 2 30000.00 1 14.92 7.0 0.0 12537.0 80.0 8.0 0.0 0.0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 10.308953
967892 457634 5000.0 7.21 154.87 0 49358.39 0 23.68 11.0 0.0 2793.0 6.8 15.0 2.0 0.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.806863

967893 rows × 206 columns

Import the models of your choice
We select the features we want to use in predicting our outcome

In [136]:
reduce_memory_usage(train_df13, verbose=True)
Mem. usage decreased to 215.07 Mb (25.8% reduction)
Out[136]:
index loan_amnt int_rate installment grade annual_inc loan_status dti open_acc pub_rec revol_bal revol_util total_acc mort_acc pub_rec_bankruptcies term_60 home_ownership_Other home_ownership_Own home_ownership_Rent verification_status_Verified purpose_Credit_Card purpose_Debt_Consolidation purpose_Educational purpose_Home_Improvement purpose_House purpose_Major_Purchase purpose_Medical purpose_Moving purpose_Other purpose_Renewable_Energy purpose_Small_Business purpose_Vacation purpose_Wedding zip_code_01 zip_code_02 zip_code_03 zip_code_04 zip_code_05 zip_code_06 zip_code_07 zip_code_08 zip_code_09 zip_code_10 zip_code_11 zip_code_12 zip_code_13 zip_code_14 zip_code_15 zip_code_16 zip_code_17 zip_code_18 zip_code_19 zip_code_20 zip_code_21 zip_code_22 zip_code_23 zip_code_24 zip_code_25 zip_code_26 zip_code_27 zip_code_28 zip_code_29 zip_code_30 zip_code_31 zip_code_32 zip_code_33 zip_code_34 zip_code_35 zip_code_36 zip_code_37 zip_code_38 zip_code_39 zip_code_40 zip_code_41 zip_code_42 zip_code_43 zip_code_44 zip_code_45 zip_code_46 zip_code_47 zip_code_48 zip_code_49 zip_code_50 zip_code_51 zip_code_52 zip_code_53 zip_code_54 zip_code_55 zip_code_56 zip_code_57 zip_code_58 zip_code_59 zip_code_60 zip_code_61 zip_code_62 zip_code_63 zip_code_64 zip_code_65 zip_code_66 zip_code_67 zip_code_68 zip_code_69 zip_code_70 zip_code_71 zip_code_72 zip_code_73 zip_code_74 zip_code_75 zip_code_76 zip_code_77 zip_code_78 zip_code_79 zip_code_80 zip_code_81 zip_code_82 zip_code_83 zip_code_84 zip_code_85 zip_code_86 zip_code_87 zip_code_88 zip_code_89 zip_code_90 zip_code_91 zip_code_92 zip_code_93 zip_code_94 zip_code_95 zip_code_96 zip_code_97 zip_code_98 zip_code_99 application_type_Joint App Year_cr_line_1944 Year_cr_line_1946 Year_cr_line_1948 Year_cr_line_1949 Year_cr_line_1950 Year_cr_line_1951 Year_cr_line_1952 Year_cr_line_1953 Year_cr_line_1954 Year_cr_line_1955 Year_cr_line_1956 Year_cr_line_1957 Year_cr_line_1958 Year_cr_line_1959 Year_cr_line_1960 Year_cr_line_1961 Year_cr_line_1962 Year_cr_line_1963 Year_cr_line_1964 Year_cr_line_1965 Year_cr_line_1966 Year_cr_line_1967 Year_cr_line_1968 Year_cr_line_1969 Year_cr_line_1970 Year_cr_line_1971 Year_cr_line_1972 Year_cr_line_1973 Year_cr_line_1974 Year_cr_line_1975 Year_cr_line_1976 Year_cr_line_1977 Year_cr_line_1978 Year_cr_line_1979 Year_cr_line_1980 Year_cr_line_1981 Year_cr_line_1982 Year_cr_line_1983 Year_cr_line_1984 Year_cr_line_1985 Year_cr_line_1986 Year_cr_line_1987 Year_cr_line_1988 Year_cr_line_1989 Year_cr_line_1990 Year_cr_line_1991 Year_cr_line_1992 Year_cr_line_1993 Year_cr_line_1994 Year_cr_line_1995 Year_cr_line_1996 Year_cr_line_1997 Year_cr_line_1998 Year_cr_line_1999 Year_cr_line_2000 Year_cr_line_2001 Year_cr_line_2002 Year_cr_line_2003 Year_cr_line_2004 Year_cr_line_2005 Year_cr_line_2006 Year_cr_line_2007 Year_cr_line_2008 Year_cr_line_2009 Year_cr_line_2010 Year_cr_line_2011 Year_cr_line_2012 Year_cr_line_2013 Year_cr_line_2014 Year_cr_line_2015 Year_cr_line_2016 Year_cr_line_2017 annual_inc_log
0 708815 2600.0 10.406250 84.3750 1 40000.000000 1 9.390625 6.0 4.0 2146.0 52.312500 32.0 1.0 1.0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.593750
1 236060 16000.0 14.093750 547.5000 1 62000.000000 0 32.187500 5.0 0.0 11896.0 67.187500 18.0 0.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.031250
2 292369 15000.0 14.742188 354.7500 2 38000.000000 0 16.843750 7.0 0.0 15927.0 58.093750 10.0 0.0 0.0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 10.546875
3 160815 3500.0 16.953125 124.6875 2 61000.000000 0 25.656250 10.0 0.0 14464.0 60.000000 22.0 3.0 0.0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.015625
4 691072 16000.0 13.593750 543.5000 2 90000.000000 1 16.984375 18.0 0.0 15440.0 54.593750 27.0 0.0 0.0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.406250
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
967888 1012275 12000.0 14.078125 410.5000 2 48100.000000 1 18.343750 9.0 0.0 11940.0 56.312500 24.0 1.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.781250
967889 489905 35008.0 16.546875 1240.0000 3 75000.000000 0 20.734375 8.0 0.0 5446.0 93.875000 20.0 5.0 0.0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.226562
967890 867007 14000.0 13.328125 474.0000 2 102000.000000 1 22.890625 15.0 0.0 24866.0 67.625000 40.0 3.0 0.0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11.531250
967891 1019304 8952.0 14.078125 306.2500 2 30000.000000 1 14.921875 7.0 0.0 12537.0 80.000000 8.0 0.0 0.0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 10.312500
967892 457634 5000.0 7.210938 154.8750 0 49358.390625 0 23.687500 11.0 0.0 2793.0 6.800781 15.0 2.0 0.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10.804688

967893 rows × 206 columns

Train-Test Split</br> We split the data to facilitate the evaluation of the model

In [137]:
X = train_df13[feat_selection] # Select the features you want to use to predict the loan_status
y = train_df13['loan_status'].astype(int)

logReg = LogisticRegression(random_state=0, class_weight='balanced')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
You may opt for using k-fold cross validation as well.
In [138]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [139]:
logReg = logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
y_pred_proba = logReg.predict_proba(X_test)[:,1]

Confusion Matrix

In [140]:
metrics.confusion_matrix(y_test, y_pred)
Out[140]:
array([[117425,  60595],
       [ 39985,  72363]], dtype=int64)
In [141]:
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.6440969131626731
Precision 0.5442545766332225
F1 Score 0.5899814925032408

We evaluate the model's AUC using metrics.roc_auc_score()

In [142]:
print('AUC:', metrics.roc_auc_score(y_test, y_pred_proba)) 
AUC: 0.7100509095839154

4b. Random Forest (using bagging)¶

RFC on Imablanced Data: https://medium.com/sfu-cspmp/surviving-in-a-random-forest-with-imbalanced-datasets-b98b963d52eb

In [151]:
rfc = BalancedRandomForestClassifier(random_state = 5)
param = {
    'n_estimators' : [100, 200],
    'max_depth' : [2,4,8, None],
    'max_features' : ['sqrt', 'log2', None]
}

rfc_grid = RandomizedSearchCV(rfc, param_distributions = param, n_jobs = -1, scoring = 'recall')
rfc_grid.fit(X_train, y_train)

y_pred_proba= rfc_grid.predict_proba(X_test)[:,1]
y_pred = rfc_grid.predict(X_test)

print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7177697867340762
Precision 0.5085739872982638
F1 Score 0.5953290588352546

XGBoost (using boosting)¶

XGB on Imbalanced Data: https://medium.com/@rithpansanga/xgboost-and-imbalanced-datasets-strategies-for-handling-class-imbalance-cdd810b3905c

In [144]:
pos_weight = y_train.value_counts()[0]/y_train.value_counts()[1]
pos_weight
Out[144]:
1.579937017588619
In [146]:
xgboost = xgb.XGBClassifier(booster = 'gbtree', random_state = 5, scale_pos_weight= pos_weight)

xgboost.fit(X_train, y_train)

y_pred_proba= xgboost.predict_proba(X_test)[:,1]
y_pred = xgboost.predict(X_test)

print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7058247587852031
Precision 0.5524414627179691
F1 Score 0.619784359624681
In [145]:
xgboost = xgb.XGBClassifier(booster = 'gbtree', random_state = 5, scale_pos_weight= pos_weight)
param = {'learning_rate': [0.1, 0.3, 0.5], 'max_depth' : [2, 4, 8]}


xgboost_grid = RandomizedSearchCV(xgboost, param_distributions = param, n_iter = 3, n_jobs = -1, scoring = 'recall')
xgboost_grid.fit(X_train, y_train)

y_pred_proba= xgboost_grid.predict_proba(X_test)[:,1]
y_pred = xgboost_grid.predict(X_test)

print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7068750667568626
Precision 0.5530939861406136
F1 Score 0.6205999226360024
In [155]:
feat_importances = pd.Series(xgboost.feature_importances_, index=X.columns).sort_values(ascending  = False)
fig = px.bar(feat_importances)
fig.show()

Neural Network (using deep learning algorithm not taught in syllabus)¶

In [ ]:
perceptron = MLPClassifier()
param = {
    'alpha': [0.0001, 0.001],
    'learning_rate_init' : [0.1, 0.2, 0.3, 0.4, 0.5],
    'max_iter': [500, 1000],
    'tol' : [0.001, 0.005]  
}

perceptron_grid = GridSearchCV(perceptron, param_grid = param, scoring = 'recall')
perceptron_grid.fit(X_train, y_train.ravel())

y_pred_proba= perceptron.predict_proba(X_test)[:,1]
y_predict = perceptron.predict(X_test)

print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
In [ ]:
# from sklearn.neural_network import MLPClassifier

# alpha = 0
# learing_rate = 0
# max_iteration = 0
# tol = 0
# n_iter = 0
# bestAUC = 0
# acc = 0

# print('Alpha | Learning Rate | Max Iteration | Tolerance | AUC | Accuracy')
# for a in [0.0001, 0.001]:
#     for learning_rate in range(1,5):
#         for max_iteration in [500, 1000]:
#             for tol in [0.001, 0.005]:
#                     perceptron = MLPClassifier(solver = 'adam', alpha = a,
#                                                learning_rate_init = (learning_rate/10), max_iter = max_iteration,
#                                                tol = tol,  random_state = 5)
#                     perceptron.fit(X_train, y_train.ravel())
#                     y_pred_proba= perceptron.predict_proba(X_test)[:,1]
#                     y_predict = perceptron.predict(X_test)
#                     auc_score = metrics.roc_auc_score(y_test, y_pred_proba)
#                     accuracy = metrics.accuracy_score(y_test, y_predict)
#                     print(a, learning_rate/10, max_iteration, tol, round(auc_score,4), round(accuracy, 4))
#                     if auc_score > bestAUC:
#                         bestAUC = round(auc_score,4)
#                         alpha = a
#                         learing_rate = learning_rate/10
#                         max_iteration = max_iteration
#                         tol = tol
#                         acc = round(accuracy,4)
#                     else:
#                         pass
# print(alpha, "|", learning_rate, "|", max_iteration, "|", tol, "|", bestAUC, "|", acc)